import urllib.request
from bs4 import BeautifulSoup


class Scraper:
    """Collect links from a web page whose ``href`` contains ``"html"``.

    The page at ``site`` is downloaded, every ``<a>`` tag is inspected, and
    each ``href`` value containing the substring ``"html"`` is both printed
    and written to a text file.
    """

    def __init__(self, site):
        """Remember the page to scrape.

        Args:
            site: URL (or ``Request`` object) handed to
                ``urllib.request.urlopen`` when ``scrape`` is called.
        """
        self.site = site

    def scrape(self, output_path="baidu_scrape.txt"):
        """Download the page and record every matching link.

        Each matching ``href`` is printed and written to ``output_path``
        preceded by a newline, so the file starts with an empty line.

        Args:
            output_path: File that is created or truncated and filled with
                the links found. Defaults to ``"baidu_scrape.txt"``, the
                name this method has always written to.

        Raises:
            urllib.error.URLError: If the page cannot be fetched.
            OSError: If the output file cannot be opened or written.
        """
        # Use the response as a context manager so the underlying connection
        # is released even if reading fails.
        with urllib.request.urlopen(self.site) as response:
            html = response.read()

        soup = BeautifulSoup(html, "html.parser")

        # Explicit UTF-8: hrefs may contain characters the platform's default
        # encoding cannot represent, which would abort the run mid-file.
        with open(output_path, "w", encoding="utf-8") as f:
            for tag in soup.find_all("a"):
                url = tag.get("href")
                # Anchors without an href yield None; skip those and any
                # link that does not mention "html".
                if url and "html" in url:
                    print("\n" + url)
                    f.write("\n" + url)

# Scrape the Baidu News front page. There is no ``__main__`` guard, so this
# runs whenever the module is executed or imported.
Scraper(site="https://news.baidu.com/").scrape()
